Differential expression using the bulk RNAseq JAX_RNAseq_ExtraEmbryonic data from the December 2023 release.

There are two model systems involved in this dataset, ExM & PrS.

There are also both normoxia and hypoxia samples involved in this dataset. Need to create another column for this information.

There is only one day value.

R libraries.

library(ggplot2)
library(ggrepel)
library(ggpubr)
library(stringr)
library(MASS)
library(RColorBrewer)
library(DESeq2)

library(viridis)
library(ggpointdensity)
library(dplyr)
library(data.table)
library(ComplexHeatmap)
library(UpSetR)
library(readxl)
theme_set(theme_classic())

dat_path = "../../MorPhiC/data/December-2023/JAX_RNAseq_ExtraEmbryonic/"
dat_path = paste0(dat_path, "processed/Tables")

Read in meta data

meta = fread("data/December_2023/JAX_RNAseq_ExtraEmbryonic_meta_data.tsv", 
             data.table = FALSE)
dim(meta)
## [1] 90 36
meta[1:2,]
##    clonal.label library_preparation.label             label
## 1   MOK20010-WT                GT23-10491   PrS-MOK20010-WT
## 2 MOK20010C-A06                GT23-10506 PrS-MOK20010C-A06
##                                                                            description
## 1                                                  KOLF2.2 derived primitive syncytium
## 2 KOLF2.2 deleted POU2F3 by deletion of critical exon (CE) derived primitive syncytium
##   differentiated_product_protocol_id model_system timepoint_value
## 1                           JAXPD001          PrS               6
## 2                           JAXPD001          PrS               6
##   timepoint_unit final_timepoint treatment_condition wt_control_status
## 1           days             Yes             hypoxia                WT
## 2           days             Yes      Not applicable                KO
##   ko_strategy ko_gene library_preparation.description
## 1          WT      WT                              NA
## 2          CE  POU2F3                              NA
##   library_preparation.library_preparation_protocol_id
## 1                                            JAXPL001
## 2                                            JAXPL001
##   library_preparation.average_fragment_size
## 1                                       431
## 2                                       428
##   library_preparation.input_amount_value library_preparation.input_amount_unit
## 1                                    300                                    ng
## 2                                    300                                    ng
##   library_preparation.concentration_value
## 1                                    47.7
## 2                                    47.7
##   library_preparation.concentration_unit library_preparation.total_yield_value
## 1                                     nM                                   244
## 2                                     nM                                   244
##   library_preparation.total_yield_unit library_preparation.cdna_pcr_cycles
## 1                                   ng                                  10
## 2                                   ng                                  10
##   library_preparation.pcr_cycles_for_indexing
## 1                               Not available
## 2                               Not available
##                                                file_id
## 1 KOLF2_POU2F3_1_GT23-10491_CCGTGAAG-ATCCACTG_S38_L001
## 2  POU2F3_CE_B05_GT23-10506_GCAGAATT-TGGCCGGT_S17_L001
##                        run_id
## 1 20230809_23-robson-005-run2
## 2 20230809_23-robson-005-run2
##                                         clonal.description clonal.parental_name
## 1                                                  KOLF2.2             KOLF2.2J
## 2 KOLF2.2 deleted POU2F3 by deletion of critical exon (CE)             KOLF2.2J
##   clonal.clone_id clonal.type clonal.zygosity
## 1              WT        iPSC  Not applicable
## 2             A06        iPSC  Not applicable
##   clonal.cell_line_generation_protocol clonal.treatment_condition
## 1                       Not applicable             Not applicable
## 2                       Not applicable             Not applicable
##   clonal.wt_control_status expression_alteration.label
## 1                       WT              Not applicable
## 2                       KO    JAX_POU2F3_Critical_exon
##                                 model_organ
## 1 extra-embryonic primitive syncytial cells
## 2 extra-embryonic primitive syncytial cells
names(meta)
##  [1] "clonal.label"                                       
##  [2] "library_preparation.label"                          
##  [3] "label"                                              
##  [4] "description"                                        
##  [5] "differentiated_product_protocol_id"                 
##  [6] "model_system"                                       
##  [7] "timepoint_value"                                    
##  [8] "timepoint_unit"                                     
##  [9] "final_timepoint"                                    
## [10] "treatment_condition"                                
## [11] "wt_control_status"                                  
## [12] "ko_strategy"                                        
## [13] "ko_gene"                                            
## [14] "library_preparation.description"                    
## [15] "library_preparation.library_preparation_protocol_id"
## [16] "library_preparation.average_fragment_size"          
## [17] "library_preparation.input_amount_value"             
## [18] "library_preparation.input_amount_unit"              
## [19] "library_preparation.concentration_value"            
## [20] "library_preparation.concentration_unit"             
## [21] "library_preparation.total_yield_value"              
## [22] "library_preparation.total_yield_unit"               
## [23] "library_preparation.cdna_pcr_cycles"                
## [24] "library_preparation.pcr_cycles_for_indexing"        
## [25] "file_id"                                            
## [26] "run_id"                                             
## [27] "clonal.description"                                 
## [28] "clonal.parental_name"                               
## [29] "clonal.clone_id"                                    
## [30] "clonal.type"                                        
## [31] "clonal.zygosity"                                    
## [32] "clonal.cell_line_generation_protocol"               
## [33] "clonal.treatment_condition"                         
## [34] "clonal.wt_control_status"                           
## [35] "expression_alteration.label"                        
## [36] "model_organ"
table(table(meta$clonal.label))
## 
##  1  2 
## 66 12
table(table(meta$library_preparation.label))
## 
##  1 
## 90
table(table(meta$label))
## 
##  1 
## 90

Extract condition information

meta$label[1:6]
## [1] "PrS-MOK20010-WT"   "PrS-MOK20010C-A06" "PrS-MOK20010C-B05"
## [4] "PrS-MOK20010C-B06" "PrS-MOK20010P-A09" "PrS-MOK20010P-A10"
cell_line_id = str_split(meta$label, pattern = '-')
table(sapply(cell_line_id, length))
## 
##  3  4 
## 66 24
cell_line_id = lapply(cell_line_id, function(x) { length(x) <- 4; x})
cell_line_id = as.data.frame(do.call(rbind, cell_line_id))
cell_line_id[1:6, ]
##    V1        V2  V3   V4
## 1 PrS  MOK20010  WT <NA>
## 2 PrS MOK20010C A06 <NA>
## 3 PrS MOK20010C B05 <NA>
## 4 PrS MOK20010C B06 <NA>
## 5 PrS MOK20010P A09 <NA>
## 6 PrS MOK20010P A10 <NA>
meta$condition = cell_line_id[, 4]

meta$condition[which(is.na(meta$condition))] = "nor"
table(meta$condition, useNA="ifany")
## 
## hyp nor 
##  12  78
table(meta$model_organ, meta$condition, useNA="ifany")
##                                            
##                                             hyp nor
##   extra-embryonic mesenchymal cells           0  12
##   extra-embryonic primitive syncytial cells  12  66
table(meta$run_id, useNA="ifany")
## 
## 20230809_23-robson-005-run2      20230918_23-robson-008 
##                          30                          24 
##      20230918_23-robson-009      20230918_23-robson-010 
##                          12                          12 
##      20231004_23-robson-011 
##                          12
table(meta$run_id, meta$condition, useNA="ifany")
##                              
##                               hyp nor
##   20230809_23-robson-005-run2   0  30
##   20230918_23-robson-008       12  12
##   20230918_23-robson-009        0  12
##   20230918_23-robson-010        0  12
##   20231004_23-robson-011        0  12

Explore run id and ko gene info

table(meta$run_id, meta$model_system, useNA="ifany")
##                              
##                               ExM PrS
##   20230809_23-robson-005-run2   0  30
##   20230918_23-robson-008        0  24
##   20230918_23-robson-009        0  12
##   20230918_23-robson-010        0  12
##   20231004_23-robson-011       12   0
table(meta$model_system, meta$ko_gene, useNA="ifany")
##      
##       EPAS1 FOSB GCM1 GRHL1 ISL1 POU2F3 PPARG WT
##   ExM     0    0    0     0    9      0     0  3
##   PrS    18    9    9     9    0      9     9 15
table(meta$run_id, meta$ko_gene, useNA="ifany")
##                              
##                               EPAS1 FOSB GCM1 GRHL1 ISL1 POU2F3 PPARG WT
##   20230809_23-robson-005-run2     0    9    0     9    0      9     0  3
##   20230918_23-robson-008         18    0    0     0    0      0     0  6
##   20230918_23-robson-009          0    0    9     0    0      0     0  3
##   20230918_23-robson-010          0    0    0     0    0      0     9  3
##   20231004_23-robson-011          0    0    0     0    9      0     0  3
table(meta$ko_strategy, meta$ko_gene, useNA="ifany")
##      
##       EPAS1 FOSB GCM1 GRHL1 ISL1 POU2F3 PPARG WT
##   CE      6    3    3     3    3      3     3  0
##   KO      6    3    3     3    3      3     3  0
##   PTC     6    3    3     3    3      3     3  0
##   WT      0    0    0     0    0      0     0 18

Read in gene count data and filter genes

Manually correcting the gene strings in certain column names.

As of the time of running this code, all differences below are due to partial correction of the GHRL gene name.

It should be “GRHL1” instead of “GHRL1”.

It is fixed in the file_id column in meta table, but not fixed in the count data file column names yet.

cts = fread(file.path(dat_path, "genesCounts.csv"), data.table = FALSE)
dim(cts)
## [1] 38592    91
cts[1:2, c(1:2, (ncol(cts)-1):ncol(cts))]
##              Name POU2F3_KO_F04_GT23-10504_GGTACCTT-GACGTCTT_S33_L001
## 1 ENSG00000268674                                                   0
## 2 ENSG00000271254                                                1030
##   PTC_A09__Hypoxia_GT23-11309_AACGTTCC-AGTACTCC_S30_L001
## 1                                                      0
## 2                                                    489
##   PTC_D10__Hypoxia_GT23-11310_GCAGAATT-TGGCCGGT_S22_L001
## 1                                                      0
## 2                                                    603
setdiff(names(cts)[-1], meta$file_id)
##  [1] "GHRL1_CE_A05_GT23-10497_TTGGACTC-CTGCTTCC_S12_L001" 
##  [2] "KOLF2_GHRL1_2_GT23-10492_TTACAGGA-GCTTGTCA_S6_L001" 
##  [3] "GHRL1_PTC_A09_GT23-10500_TAATACAG-GTGAATAT_S2_L001" 
##  [4] "GHRL1_PTC_C09_GT23-10502_ATGTAAGT-CATAGAGT_S19_L001"
##  [5] "GHRL1_PTC_A10_GT23-10501_CGGCGTGA-ACAGGCGC_S30_L001"
##  [6] "GHRL1_CE_A08_GT23-10498_CAGTAGGC-ATTCGTCA_S39_L001" 
##  [7] "GHRL1_KO_D02_GT23-10494_TCTCTACT-GAACCGCG_S26_L001" 
##  [8] "GHRL1_KO_H04_GT23-10496_CCAAGTCT-TCATCCTT_S35_L001" 
##  [9] "GHRL1_CE_D07_GT23-10499_TGACGAAT-GCCTACTG_S31_L001" 
## [10] "GHRL1_KO_G04_GT23-10495_CTCTCGTC-AGGTTATA_S15_L001"
setdiff(meta$file_id, names(cts)[-1])
##  [1] "KOLF2_GRHL1_2_GT23-10492_TTACAGGA-GCTTGTCA_S6_L001" 
##  [2] "GRHL1_CE_A05_GT23-10497_TTGGACTC-CTGCTTCC_S12_L001" 
##  [3] "GRHL1_CE_A08_GT23-10498_CAGTAGGC-ATTCGTCA_S39_L001" 
##  [4] "GRHL1_CE_D07_GT23-10499_TGACGAAT-GCCTACTG_S31_L001" 
##  [5] "GRHL1_PTC_A09_GT23-10500_TAATACAG-GTGAATAT_S2_L001" 
##  [6] "GRHL1_PTC_A10_GT23-10501_CGGCGTGA-ACAGGCGC_S30_L001"
##  [7] "GRHL1_PTC_C09_GT23-10502_ATGTAAGT-CATAGAGT_S19_L001"
##  [8] "GRHL1_KO_D02_GT23-10494_TCTCTACT-GAACCGCG_S26_L001" 
##  [9] "GRHL1_KO_G04_GT23-10495_CTCTCGTC-AGGTTATA_S15_L001" 
## [10] "GRHL1_KO_H04_GT23-10496_CCAAGTCT-TCATCCTT_S35_L001"
## starting manually correcting count data file column names
names(cts) = gsub("GHRL", "GRHL", names(cts))
## end the manual correction

stopifnot(setequal(names(cts)[-1], meta$file_id))

meta = meta[match(names(cts)[-1], meta$file_id),]
table(names(cts)[-1] == meta$file_id)
## 
## TRUE 
##   90
cts_dat = data.matrix(cts[,-1])
rownames(cts_dat) = cts$Name

table(meta$timepoint_value, useNA="ifany")
## 
##  6 
## 90
meta$timepoint = as.character(meta$timepoint_value)

Read in gene annoation

gene_anno = fread("data/gencode_v44_primary_assembly_info.tsv")
dim(gene_anno)
## [1] 62754     8
gene_anno[1:2,]
##                geneId    chr strand     start       end ensembl_gene_id
##                <char> <char> <char>     <int>     <int>          <char>
## 1: ENSG00000000003.16   chrX      - 100627108 100639991 ENSG00000000003
## 2:  ENSG00000000005.6   chrX      + 100584936 100599885 ENSG00000000005
##    hgnc_symbol                                       description
##         <char>                                            <char>
## 1:      TSPAN6 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]
## 2:        TNMD   tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757]
table(rownames(cts_dat) %in% gene_anno$ensembl_gene_id)
## 
##  TRUE 
## 38592
# all ensembl gene IDs are contained in the annotation
setdiff(rownames(cts_dat), gene_anno$ensembl_gene_id)
## character(0)

Discard genes whose expression is 0 in more than 75% of samples

model_s = meta$model_system
table(model_s, useNA="ifany")
## model_s
## ExM PrS 
##  12  78
get_q75 <- function(x){quantile(x, probs = 0.75)}

gene_info = data.frame(Name = cts$Name, 
                       t(apply(cts_dat, 1, tapply, model_s, min)), 
                       t(apply(cts_dat, 1, tapply, model_s, median)), 
                       t(apply(cts_dat, 1, tapply, model_s, get_q75)), 
                       t(apply(cts_dat, 1, tapply, model_s, max)))

dim(gene_info)
## [1] 38592     9
gene_info[1:2,]
##                            Name ExM PrS ExM.1 PrS.1  ExM.2 PrS.2 ExM.3 PrS.3
## ENSG00000268674 ENSG00000268674   0   0   0.0   0.0   0.00     0     0     3
## ENSG00000271254 ENSG00000271254 634 387 812.5 775.5 848.25   923   948  1169
table(row.names(gene_info)==gene_info$Name, useNA="ifany")
## 
##  TRUE 
## 38592
names(gene_info)[2:9] = paste0(rep(c("ExM_", "PrS_"), times=4), 
                               rep(c("min", "median", "q75", "max"), each=2))
dim(gene_info)
## [1] 38592     9
gene_info[1:2,]
##                            Name ExM_min PrS_min ExM_median PrS_median ExM_q75
## ENSG00000268674 ENSG00000268674       0       0        0.0        0.0    0.00
## ENSG00000271254 ENSG00000271254     634     387      812.5      775.5  848.25
##                 PrS_q75 ExM_max PrS_max
## ENSG00000268674       0       0       3
## ENSG00000271254     923     948    1169
summary(gene_info[,-1])
##     ExM_min            PrS_min           ExM_median         PrS_median      
##  Min.   :     0.0   Min.   :     0.0   Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0  
##  Median :     0.0   Median :     0.0   Median :     2.0   Median :     1.0  
##  Mean   :   376.3   Mean   :   255.1   Mean   :   530.5   Mean   :   585.1  
##  3rd Qu.:   120.0   3rd Qu.:    60.0   3rd Qu.:   188.5   3rd Qu.:   182.5  
##  Max.   :512761.0   Max.   :154723.0   Max.   :797550.5   Max.   :379480.0  
##     ExM_q75            PrS_q75            ExM_max            PrS_max      
##  Min.   :     0.0   Min.   :     0.0   Min.   :     0.0   Min.   :     0  
##  1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     1  
##  Median :     3.0   Median :     3.0   Median :     6.0   Median :    10  
##  Mean   :   608.8   Mean   :   712.7   Mean   :   751.6   Mean   :  1102  
##  3rd Qu.:   228.0   3rd Qu.:   233.8   3rd Qu.:   285.0   3rd Qu.:   405  
##  Max.   :886970.0   Max.   :456738.2   Max.   :928104.0   Max.   :801404
table(gene_info$ExM_q75 > 0, gene_info$PrS_q75 > 0)
##        
##         FALSE  TRUE
##   FALSE 12757   959
##   TRUE   2139 22737
w2kp = which(gene_info$ExM_q75 > 0 | gene_info$PrS_q75 > 0)
cts_dat = cts_dat[w2kp,]
dim(cts_dat)
## [1] 25835    90
gene_info = gene_info[w2kp,]

meta$read_depth = colSums(cts_dat)/1e6
meta$q75 = apply(cts_dat, 2, quantile, probs = 0.75)

ggplot(meta, aes(x=read_depth, y=q75, color=ko_strategy, shape = model_system)) +
    geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") + 
  scale_shape_manual(values = c(7, 16)) + 
  xlab("read depth (million)") + ylab("75 percentile of gene expression") + 
  labs(color = "KO type", shape = "Model") + 
  scale_color_brewer(palette = "Set1")

meta$run_id_short = str_replace(meta$run_id, "^[^-]*-", "")

ggplot(meta, aes(x=read_depth, y=q75, color=run_id_short, shape = condition)) +
    geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") + 
  scale_shape_manual(values = c(7, 16)) + 
  xlab("read depth (million)") + ylab("75 percentile of gene expression") + 
  labs(color = "Run ID", shape = "Condition") + 
  scale_color_brewer(palette = "Set1")

table(meta$model_system, meta$run_id_short)
##      
##       robson-005-run2 robson-008 robson-009 robson-010 robson-011
##   ExM               0          0          0          0         12
##   PrS              30         24         12         12          0
ggplot(meta, aes(x=read_depth, y=q75, color=run_id_short, shape = model_system)) +
    geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") + 
  scale_shape_manual(values = c(7, 16)) + 
  xlab("read depth (million)") + ylab("75 percentile of gene expression") + 
  labs(color = "Run ID", shape = "Model") + 
  scale_color_brewer(palette = "Set1")

Check the effect of hypoxia vs. normoxia among WT

sample2kp = which(meta$ko_gene == "WT")
cts_dat_m = cts_dat[,sample2kp]
meta_m    = meta[sample2kp,]
summary(meta_m$q75)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   361.0   422.2   469.8   478.4   540.5   592.0
stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))

q75 = apply(cts_dat_m, 1, quantile, probs=0.75)
cts_dat_n = t(cts_dat_m[q75 > 0,])
cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
dim(cts_dat_n)
## [1]    18 23865
sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
names(sample_pca)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
pc_scores = sample_pca$x
eigen_vals = sample_pca$sdev^2
eigen_vals[1:5]/sum(eigen_vals)
## [1] 0.41336077 0.18367275 0.08208396 0.04567809 0.03608958
pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
pvs
## [1] 41.3 18.4  8.2  4.6  3.6
PC_df = data.frame(pc_scores)
PC_df = cbind(PC_df, meta_m)

gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=condition, color=model_system)) +
  geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
    scale_shape_manual(values = c(7, 15, 16, 17)) + 
  labs(color="Model", shape ="Condition") + 
  xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
  ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
  ggtitle("Wild type samples") + 
  guides(
    color = guide_legend(title = NULL, order = 1),
    shape = guide_legend(title = NULL, order = 2)
  ) +
  theme(
    legend.margin = margin(0, 0, 0, 0), 
    legend.box.just = "left", 
    legend.direction = "vertical" 
  )

print(gPC)

gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=condition, color=run_id_short)) +
  geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
    scale_shape_manual(values = c(7, 15, 16, 17)) + 
  labs(color="Run id", shape ="Condition") + 
  xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
  ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
  ggtitle("Wild type samples") + 
  guides(
    color = guide_legend(title = NULL, order = 1),
    shape = guide_legend(title = NULL, order = 2)
  ) +
  theme(
    legend.margin = margin(0, 0, 0, 0), 
    legend.box.just = "left", 
    legend.direction = "vertical" 
  )

print(gPC)

table(meta_m$run_id, meta_m$model_system)
##                              
##                               ExM PrS
##   20230809_23-robson-005-run2   0   3
##   20230918_23-robson-008        0   6
##   20230918_23-robson-009        0   3
##   20230918_23-robson-010        0   3
##   20231004_23-robson-011        3   0

DE analysis with respect to model system for WT samples

## Generate sample information matrix

cols2kp = c("model_system", "condition", "q75")
sample_info = meta_m[,cols2kp]

dim(sample_info)
## [1] 18  3
sample_info[1:2,]
##    model_system condition   q75
## 79          PrS       nor 569.5
## 12          PrS       hyp 545.0
sample_info$log_q75 = log(sample_info$q75)

dds = DESeqDataSetFromMatrix(cts_dat_m, colData=sample_info, 
                                  ~ model_system + condition + log_q75)
dds = DESeq(dds)

## association with read-depth
res_rd = results(dds, name = "log_q75")
res_rd = as.data.frame(res_rd)
dim(res_rd)
## [1] 25835     6
res_rd[1:2,]
##                  baseMean log2FoldChange     lfcSE       stat    pvalue
## ENSG00000271254 731.89279      0.0114876 0.2318513 0.04954726 0.9604832
## ENSG00000277196  46.50349      0.9817827 1.3388605 0.73329722 0.4633772
##                      padj
## ENSG00000271254 0.9997047
## ENSG00000277196 0.9997047
table(is.na(res_rd$padj))
## 
## FALSE  TRUE 
## 17751  8084
g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) + 
  geom_histogram(color="darkblue", fill="lightblue", 
                 breaks = seq(0,1,by=0.01)) + 
  ggtitle("WT Read depth")
print(g0)

## association with model_system
dds_lrt = DESeq(dds, test="LRT", reduced = ~ condition + log_q75)
res_lrt = results(dds_lrt)

res_model_system = as.data.frame(res_lrt)
dim(res_model_system)
## [1] 25835     6
res_model_system[1:2,]
##                  baseMean log2FoldChange     lfcSE       stat     pvalue
## ENSG00000271254 731.89279      0.0114876 0.2318513 4.06378556 0.04381218
## ENSG00000277196  46.50349      0.9817827 1.3388605 0.04552192 0.83104721
##                       padj
## ENSG00000271254 0.06908888
## ENSG00000277196 0.87097036
table(is.na(res_model_system$padj))
## 
## FALSE  TRUE 
## 23736  2099
g0 = ggplot(res_model_system %>% subset(!is.na(padj)), aes(x=pvalue)) + 
  geom_histogram(color="darkblue", fill="lightblue", 
                 breaks = seq(0,1,by=0.01)) + 
  ggtitle("WT Model system")
print(g0)

Analyze samples of different model systems or conditions

For the December 2023 release of JAX_RNAseq_ExtraEmbryonic, there are two model systems: ExM and PrS, and two conditions, nor and hyp.

There are three gene knock out strategies.

Explore the PC plots first, to decide which level to run DESeq2 on.

Then run the DE analysis.

Since there is only one day value for each model system, do not include day value in each scatter plot.

meta$model_condition = paste(meta$model_system, meta$condition, sep="_")
table(meta$model_condition)
## 
## ExM_nor PrS_hyp PrS_nor 
##      12      12      66
table(meta$run_id, meta$model_condition)
##                              
##                               ExM_nor PrS_hyp PrS_nor
##   20230809_23-robson-005-run2       0       0      30
##   20230918_23-robson-008            0      12      12
##   20230918_23-robson-009            0       0      12
##   20230918_23-robson-010            0       0      12
##   20231004_23-robson-011           12       0       0
unique_model_condition_ss = unique(meta$model_condition)
unique_model_condition_ss = sort(unique_model_condition_ss)

for(cur_mc in unique_model_condition_ss){
  
  print(cur_mc)
  sample2kp = which(meta$model_condition == cur_mc)
  cts_dat_m = cts_dat[,sample2kp]
  meta_m    = meta[sample2kp,]

  print(table(meta_m$ko_strategy, str_extract(colnames(cts_dat_m), "^[^_]+_[^_]+")))
  extracted_ko_strings = str_extract_all(colnames(cts_dat_m), "CE|KO|PTC|WT")
  print(table(sapply(extracted_ko_strings, length)))
  print(table(meta_m$ko_strategy, unlist(extracted_ko_strings)))
  
  if (cur_mc == "PrS_nor"){
    print("The overlap between WT and KO is not real overlap. It is due to a different part of the filename contains substring 'KO' in these three samples:")
    print(meta_m$file_id[(meta_m$ko_strategy=="WT") & (unlist(extracted_ko_strings)=="KO")])
  }
  
  stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
  q75 = apply(cts_dat_m, 1, quantile, probs=0.75)

  cts_dat_n = t(cts_dat_m[q75 > 0,])
  cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
  dim(cts_dat_n)
  
  sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
  names(sample_pca)
  pc_scores = sample_pca$x
  eigen_vals = sample_pca$sdev^2
  eigen_vals[1:5]/sum(eigen_vals)
  pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
  pvs
  
  PC_df = data.frame(pc_scores)
  PC_df = cbind(PC_df, meta_m)
  
  gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=ko_strategy, color=ko_gene)) +
    geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
      scale_shape_manual(values = c(7, 15, 16, 17)) + 
    xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
    ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
    ggtitle(cur_mc) + 
    guides(
      color = guide_legend(title = NULL, order = 1),
      shape = guide_legend(title = NULL, order = 2)
    ) +
    theme(
      legend.margin = margin(0, 0, 0, 0), 
      legend.box.just = "left", 
      legend.direction = "vertical" 
    )

  print(gPC)
  
  gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=run_id_short, color=ko_gene)) +
    geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
      scale_shape_manual(values = c(7, 15, 16, 17)) + 
    xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
    ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
    ggtitle(cur_mc) + 
    guides(
      color = guide_legend(title = NULL, order = 1),
      shape = guide_legend(title = NULL, order = 2)
    ) +
    theme(
      legend.margin = margin(0, 0, 0, 0), 
      legend.box.just = "left", 
      legend.direction = "vertical" 
    )

  print(gPC)
  
  
  gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=run_id_short, color=ko_gene, 
                          alpha = ifelse(ko_gene == "WT", 1, 0.8))) +
    geom_point(size=2.5) + 
    scale_color_brewer(palette = "Dark2") + 
    scale_shape_manual(values = c(7, 15, 16, 17)) + 
    xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
    ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
    ggtitle(cur_mc) + guides(alpha = "none") + 
    guides(
      color = guide_legend(title = NULL, order = 1),
      shape = guide_legend(title = NULL, order = 2)
    ) +
    theme(
      legend.margin = margin(0, 0, 0, 0), 
      legend.box.just = "left", 
      legend.direction = "vertical" 
    )

  print(gPC)
  table(meta_m$run_id, meta_m$ko_gene)
  
}
## [1] "ExM_nor"
##      
##       ISL1_CE ISL1_KO ISL1_PTC ISL1_WT
##   CE        3       0        0       0
##   KO        0       3        0       0
##   PTC       0       0        3       0
##   WT        0       0        0       3
## 
##  1 
## 12 
##      
##       CE KO PTC WT
##   CE   3  0   0  0
##   KO   0  3   0  0
##   PTC  0  0   3  0
##   WT   0  0   0  3

## [1] "PrS_hyp"
##      
##       CE_E06 CE_G07 CE_H06 KO_A03 KO_B01 KO_B02 PTC_A09 PTC_D10 PTC_G09 WT_1
##   CE       1      1      1      0      0      0       0       0       0    0
##   KO       0      0      0      1      1      1       0       0       0    0
##   PTC      0      0      0      0      0      0       1       1       1    0
##   WT       0      0      0      0      0      0       0       0       0    1
##      
##       WT_2 WT_3
##   CE     0    0
##   KO     0    0
##   PTC    0    0
##   WT     1    1
## 
##  1 
## 12 
##      
##       CE KO PTC WT
##   CE   3  0   0  0
##   KO   0  3   0  0
##   PTC  0  0   3  0
##   WT   0  0   0  3

## [1] "PrS_nor"
##      
##       CE_E06 CE_G07 CE_H06 FOSB_CE FOSB_KO FOSB_PTC GCM1_CE GCM1_KO GCM1_PTC
##   CE       1      1      1       3       0        0       3       0        0
##   KO       0      0      0       0       3        0       0       3        0
##   PTC      0      0      0       0       0        3       0       0        3
##   WT       0      0      0       0       0        0       0       0        0
##      
##       GCM1_WT GRHL1_CE GRHL1_KO GRHL1_PTC KO_A03 KO_B01 KO_B02 KOLF2_FOSB
##   CE        0        3        0         0      0      0      0          0
##   KO        0        0        3         0      1      1      1          0
##   PTC       0        0        0         3      0      0      0          0
##   WT        3        0        0         0      0      0      0          1
##      
##       KOLF2_GRHL1 KOLF2_POU2F3 POU2F3_CE POU2F3_KO POU2F3_PTC PPARG_CE PPARG_KO
##   CE            0            0         3         0          0        3        0
##   KO            0            0         0         3          0        0        3
##   PTC           0            0         0         0          3        0        0
##   WT            1            1         0         0          0        0        0
##      
##       PPARG_PTC PPARG_WT PTC_A09 PTC_D10 PTC_G09 WT_1 WT_2 WT_3
##   CE          0        0       0       0       0    0    0    0
##   KO          0        0       0       0       0    0    0    0
##   PTC         3        0       1       1       1    0    0    0
##   WT          0        3       0       0       0    1    1    1
## 
##  1 
## 66 
##      
##       CE KO PTC WT
##   CE  18  0   0  0
##   KO   0 18   0  0
##   PTC  0  0  18  0
##   WT   0  3   0  9
## [1] "The overlap between WT and KO is not real overlap. It is due to a different part of the filename contains substring 'KO' in these three samples:"
## [1] "KOLF2_GRHL1_2_GT23-10492_TTACAGGA-GCTTGTCA_S6_L001"  
## [2] "KOLF2_FOSB_1_GT23-10493_GACCTGAA-CTCACCAA_S32_L001"  
## [3] "KOLF2_POU2F3_1_GT23-10491_CCGTGAAG-ATCCACTG_S38_L001"

Use each combination of (model system, condition) as DE group.

Run analysis for each DE group separately.

For the output files, create two versions, one with direct information, one with padj set to NA for the genes when the number of 0 per test is >=4.

In more details, for both versions of the outputs, make it one file per knockout gene per DE group (DE group can be model system, model system + condition, model system + run_id, model system + day, etc.). The first version of the output files contains

gene id, gene symbol

and for each knockout strategy:

baseMean, log2FoldChange, lfcSE, stat, pvalue, padj

The second version of the output files contains

gene id, gene symbol, chr, strand, position

and for each knockout strategy:

log2FoldChange, pvalue, padj (set to be NA if the number of 0 per test >= 4), 

In addition, save out a separate file of the sample size for each comparison. This needs to be by DE group.

Prepare output directories for DE results

df_size = NULL

release_name = "2023_12_JAX_RNAseq_ExtraEmbryonic"

output_dir = file.path("results", release_name)
raw_output_dir = file.path(output_dir, "raw")
processed_output_dir = file.path(output_dir, "processed")

if (!file.exists(raw_output_dir)){
  dir.create(raw_output_dir, recursive = TRUE)
}

if (!file.exists(processed_output_dir)){
  dir.create(processed_output_dir, recursive = TRUE)
}

DE analysis for all genes

unique_model_ss = unique(meta$model_system)
unique_model_ss = sort(unique_model_ss)
unique_model_ss
## [1] "ExM" "PrS"
for (model1 in unique_model_ss){
  
  print(model1)
  sample2kp = which(meta$model_system == model1)
  cts_dat_m = cts_dat[,sample2kp]
  meta_m    = meta[sample2kp,]

  stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
  print(table(meta_m$file_id == colnames(cts_dat_m), useNA="ifany"))  
  
  ## Generate sample information matrix
  cols2kp = c("model_system", "condition", "ko_strategy", "ko_gene", "run_id", 
              "run_id_short", "q75", "timepoint")
  sample_info = meta_m[,cols2kp]
  colnames(sample_info)[which(colnames(sample_info)=="timepoint")] = "day"
    
  dim(sample_info)
  sample_info[1:2,]
      
  print(table(sample_info$ko_strategy, sample_info$ko_gene))
  
  sample_info$ko_group = paste(sample_info$ko_gene, 
                               sample_info$ko_strategy, sep="_")
  print(table(sample_info$ko_group))
  
  # use the combination of (model system, condition) as DE group
  
  sample_info$model_day = paste0(sample_info$model_system, 
                                 "_day_", 
                                 as.character(sample_info$day))
    
  sample_info$de_grp = paste0(sample_info$model_day, "_", 
                              sample_info$condition)

  sorted_de_grps = sort(unique(sample_info$de_grp))
  sorted_de_grps

  for(d_group in sorted_de_grps){
    
    print(d_group)
    
    dg_index = which(sample_info$de_grp==d_group)
    cts_dat_m_dg = cts_dat_m[, dg_index]
    sample_info_dg = sample_info[dg_index, ]

    stopifnot(all(sample_info_dg$de_grp==d_group))   
    
    print("-----------------------------------")      
    print(paste0("Model system: ", model1))  
    print(sprintf("DE analysis group %s DESeq2 results:", d_group))
    print("-----------------------------------")  
      

    # do two steps of filtering
    # first, filter based on q75 of each gene
    # second, filter based on whether each gene is expressed in at least 4 cells
  
    dg_q75 = apply(cts_dat_m_dg, 1, quantile, probs=0.75)
    cts_dat_m_dg = cts_dat_m_dg[dg_q75 > 0,]
    
    print(sprintf("After filtering by gene expression q75, the number of genes reduces from %d to %d", 
                  length(dg_q75), nrow(cts_dat_m_dg)))
    
    n_pos = rowSums(cts_dat_m_dg>0)
    cts_dat_m_dg = cts_dat_m_dg[n_pos >= 4,]
    
    if (length(n_pos)==nrow(cts_dat_m_dg)){
      print("After filtering by nonzero expression in at least 4 samples, the number of genes does not change.")
    }else{
      print(sprintf("After filtering by nonzero expression in at least 4 samples, the number of genes reduces from %d to %d", 
                    length(n_pos), nrow(cts_dat_m_dg)))    
    }
  
    # update the q75 based on only genes used in the process
    sample_info_dg$q75 = apply(cts_dat_m_dg, 2, quantile, probs = 0.75)
    sample_info_dg$log_q75 = log(sample_info_dg$q75)   

    
    if (length(unique(sample_info_dg$run_id))==1){
      dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                    ~ ko_group + log_q75)
    }else{
      dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                    ~ ko_group + run_id + log_q75)     
    }
    
    start.time <- Sys.time()
    dds = DESeq(dds)
    end.time <- Sys.time()
    
    time.taken <- end.time - start.time
    print(time.taken)
    
    ## association with read-depth
    res_rd = results(dds, name = "log_q75")
    res_rd = as.data.frame(res_rd)
    dim(res_rd)
    res_rd[1:2,]
    
    table(is.na(res_rd$padj))
    
    g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) + 
      geom_histogram(color="darkblue", fill="lightblue", 
                     breaks = seq(0,1,by=0.01)) + 
      ggtitle(paste0(d_group, " read depth"))
    print(g0)
    
    ## Rerun the analysis without read-depth if it is not significant for 
    ## most genes, and the number of samples is small
    ## i.e., proportion of non-null in the distribution is smaller than 0.01
    ## (this needs to be restricted to the genes with not NA adjusted pvalue)
    ## or if smaller than 0.1 and the number of samples is not greater than 6

    pi_1_rd = max(0, mean(res_rd$pvalue[!is.na(res_rd$padj)] < 0.05) - 0.05)
    pi_1_rd = max(pi_1_rd, 0, 1 - 2*mean(res_rd$pvalue[!is.na(res_rd$padj)] > 0.5))
    
    print(sprintf("prop of non-null for rd: %.2e", pi_1_rd))
    
    if(pi_1_rd < 0.01 || (ncol(cts_dat_m_dg) <= 6 && pi_1_rd < 0.1 )){
      
      print("rerun DESeq2 without read depth")
      
      include_rd = 0
      if (length(unique(sample_info_dg$run_id))==1){
        dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                      ~ ko_group)
      }else{
        dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                      ~ ko_group + run_id)     
      }
      start.time <- Sys.time()
      dds = DESeq(dds)
      end.time <- Sys.time()
      
      time.taken <- end.time - start.time
      print(time.taken)
      
    }else{
      include_rd = 1
    }
    
    
     ## association with run_id
    
    if (length(unique(sample_info_dg$run_id))>1){
      
      if(include_rd){
        dds_lrt = DESeq(dds, test="LRT", reduced = ~ ko_group + log_q75)
      }else{
        dds_lrt = DESeq(dds, test="LRT", reduced = ~ ko_group)
      }
      
      res_lrt = results(dds_lrt)
    
      res_run_id = as.data.frame(res_lrt)
      dim(res_run_id)
      res_run_id[1:2,]
    
      table(is.na(res_run_id$padj))
      
      g0 = ggplot(res_run_id %>% subset(!is.na(padj)), aes(x=pvalue)) + 
        geom_histogram(color="darkblue", fill="lightblue", 
                       breaks = seq(0,1,by=0.01)) + 
        ggtitle(paste0(d_group, " Run ID"))
      print(g0)
      
    }   
    
    ## DE analysis for each KO gene
    table(sample_info_dg$ko_gene)
    table(sample_info_dg$ko_gene, sample_info_dg$ko_strategy)
    
    genos = setdiff(unique(sample_info_dg$ko_gene), "WT")
    genos = sort(genos)
    genos
    
    ko_grp  = c("CE", "KO", "PTC")
      
    for(geno in genos){
      
      res_geno_df = NULL
      res_geno_reduced_df = NULL
      res_geno = list()
      
      for(k_grp1 in ko_grp){
        
        res_g1 = results(dds, contrast = c("ko_group", 
                                           paste0(geno, "_", k_grp1), "WT_WT"))
        res_g1 = signif(data.frame(res_g1), 3)
        
        # add a column to record the number of zeros per test
        test_index = which(sample_info_dg$ko_group%in%c(paste0(geno, "_", k_grp1), "WT_WT"))
        
        cts_dat_m_dg_test = cts_dat_m_dg[, test_index]
        n_zero = rowSums(cts_dat_m_dg_test==0)
        res_g1$n_zeros = n_zero
        
        # record the number of samples involved in the comparison
        test_ko_group_vec = sample_info_dg$ko_group[test_index]
        
        df_size = rbind(df_size, 
                        c(d_group, 
                          paste0(geno, "_", k_grp1), 
                          sum(test_ko_group_vec!="WT_WT"), 
                          sum(test_ko_group_vec=="WT_WT")))
        
        # prepare a processed version of output
        res_g1_reduced = res_g1[, c("log2FoldChange", "pvalue", "padj", "n_zeros")]
        res_g1_reduced$padj[which(res_g1_reduced$n_zeros>=4)] = NA
        
        if ((sum(test_ko_group_vec!="WT_WT")==1)|(sum(test_ko_group_vec=="WT_WT")==1)){
          res_g1_reduced$padj = NA
        }
        
        res_g1_reduced = res_g1_reduced[, c("log2FoldChange", "pvalue", "padj")]
        
        res_geno[[k_grp1]] = res_g1_reduced
        
        if ((sum(test_ko_group_vec!="WT_WT")>1) & (sum(test_ko_group_vec=="WT_WT")>1)){
          g1 = ggplot(res_g1_reduced %>% subset(!is.na(padj)), aes(x=pvalue)) + 
            geom_histogram(color="darkblue", fill="lightblue", 
                           breaks = seq(0,1,by=0.01)) + 
            ggtitle(paste0(d_group, " ", geno, "_", k_grp1))
          print(g1)
        }

        
        tag1 = sprintf("%s_%s_", geno, k_grp1)
        colnames(res_g1) = paste0(tag1, colnames(res_g1))
        colnames(res_g1_reduced) = paste0(tag1, colnames(res_g1_reduced))      
        
        if(is.null(res_geno_df)){
          res_geno_df = res_g1
        }else if(!is.null(res_geno_df)){
          stopifnot(all(rownames(res_geno_df) == rownames(res_g1)))
          res_geno_df = cbind(res_geno_df, res_g1)
        }

        if(is.null(res_geno_reduced_df)){
          res_geno_reduced_df = res_g1_reduced
        }else if(!is.null(res_geno_reduced_df)){
          stopifnot(all(rownames(res_geno_reduced_df) == rownames(res_g1_reduced)))
          res_geno_reduced_df = cbind(res_geno_reduced_df, res_g1_reduced)
        }       

      }
      
      get_index <- function(x){
        which(x$padj < 0.05 & abs(x$log2FoldChange) > log2(1.5))
      }
      
      w_ce = get_index(res_geno[["CE"]])
      w_ko = get_index(res_geno[["KO"]])
      w_ptc = get_index(res_geno[["PTC"]])
 
      print(paste0("DE group ", d_group, " under KO gene ", geno, ":"))
      
      print(paste0("number of DE genes from knock out strategy CE: ", 
                   as.character(length(w_ce))))
      print(paste0("number of DE genes from knock out strategy KO: ", 
                   as.character(length(w_ko))))
      print(paste0("number of DE genes from knock out strategy PTC: ", 
                   as.character(length(w_ptc))))

      ce_ko_overlap = length(intersect(rownames(res_geno[["CE"]])[w_ce], 
                                       rownames(res_geno[["KO"]])[w_ko]))
      
      ko_ptc_overlap = length(intersect(rownames(res_geno[["KO"]])[w_ko], 
                                        rownames(res_geno[["PTC"]])[w_ptc]))
      
      ptc_ce_overlap = length(intersect(rownames(res_geno[["PTC"]])[w_ptc], 
                                        rownames(res_geno[["CE"]])[w_ce]))
      
      print(paste0("number of common DE genes by knock out strategies CE and KO: ", 
                   as.character(ce_ko_overlap)))
      print(paste0("number of common DE genes by knock out strategies KO and PTC: ", 
                   as.character(ko_ptc_overlap)))
      print(paste0("number of common DE genes by knock out strategies PTC and CE: ", 
                   as.character(ptc_ce_overlap)))

      if (max(length(w_ce), length(w_ko), length(w_ptc)) > 0){
        m = make_comb_mat(list("CE" = rownames(res_geno[["CE"]])[w_ce], 
                               "KO" = rownames(res_geno[["KO"]])[w_ko], 
                               "PTC" = rownames(res_geno[["PTC"]])[w_ptc]))
        g_up = UpSet(m)
        print(g_up)
      }
      
      df1 = data.frame(padj_CE = res_geno[["CE"]]$padj, 
                       padj_KO = res_geno[["KO"]]$padj, 
                       padj_PTC = res_geno[["PTC"]]$padj)
  
      cr1 = cor(df1$padj_PTC, df1$padj_CE, method = "spearman", use="pair")
      cr2 = cor(df1$padj_PTC, df1$padj_KO, method = "spearman", use="pair")
      
      g4 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC), 
                                            y = -log10(padj_CE))) +
        geom_pointdensity() + 
        ggtitle(sprintf("%s, %s\nSpearman r = %.2f", d_group, geno, cr1)) + 
        scale_color_viridis()
      
      g5 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC), 
                                            y = -log10(padj_KO))) +
        geom_pointdensity() + 
        ggtitle(sprintf("%s, %s\nSpearman r = %.2f", d_group, geno, cr2)) + 
        scale_color_viridis()
      
      print(g4)
      print(g5)

    
      dim(res_geno_df)
      res_geno_df[1:2,]
      
      dim(res_geno_reduced_df)
      res_geno_reduced_df[1:2,]
      
      res_df = data.frame(gene_ID=rownames(res_geno_df), 
                          res_geno_df)
      dim(res_df)
      res_df[1:2,]
  
      res_reduced_gene_anno = gene_anno[match(rownames(res_geno_reduced_df), 
                                              gene_anno$ensembl_gene_id), ]
      stopifnot(all(rownames(res_geno_reduced_df)==res_reduced_gene_anno$ensembl_gene_id))
  
      # set gene hgnc_symbol that equal "" to NA
      hgnc_symbol_vec = res_reduced_gene_anno$hgnc_symbol
      hgnc_symbol_vec[which(hgnc_symbol_vec=="")] = NA
        
      res_reduced_df = data.frame(gene_ID=rownames(res_geno_reduced_df), 
                                  hgnc_symbol=hgnc_symbol_vec,
                                  chr=res_reduced_gene_anno$chr, 
                                  strand=res_reduced_gene_anno$strand, 
                                  start=res_reduced_gene_anno$start, 
                                  end=res_reduced_gene_anno$end, 
                                  res_geno_reduced_df)
      
      print("hgnc_symbol empty string and NA tables:")
      print(table(res_reduced_df$hgnc_symbol=="", useNA="ifany"))
      print(table(is.na(res_reduced_df$hgnc_symbol)))
        
      dim(res_reduced_df)
      res_reduced_df[1:2,]
  
      setting_name = unique(paste0(sample_info_dg$model_day, "_", 
                                   sample_info_dg$condition))
      
      fwrite(res_df, 
             sprintf("%s/%s_%s_%s_DEseq2_raw.tsv", 
                     raw_output_dir, release_name, setting_name, geno), 
             sep="\t")
      fwrite(res_reduced_df, 
             sprintf("%s/%s_%s_%s_DEseq2.tsv", 
                     processed_output_dir, release_name, setting_name, geno), 
             sep="\t")
    
    }
    
  }

}
## [1] "ExM"
## 
## TRUE 
##   12 
##      
##       ISL1 WT
##   CE     3  0
##   KO     3  0
##   PTC    3  0
##   WT     0  3
## 
##  ISL1_CE  ISL1_KO ISL1_PTC    WT_WT 
##        3        3        3        3 
## [1] "ExM_day_6_nor"
## [1] "-----------------------------------"
## [1] "Model system: ExM"
## [1] "DE analysis group ExM_day_6_nor DESeq2 results:"
## [1] "-----------------------------------"
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 24876"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 24876 to 23842"
## Time difference of 1.552859 mins

## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"
## Time difference of 8.436475 secs

## [1] "DE group ExM_day_6_nor under KO gene ISL1:"
## [1] "number of DE genes from knock out strategy CE: 652"
## [1] "number of DE genes from knock out strategy KO: 405"
## [1] "number of DE genes from knock out strategy PTC: 432"
## [1] "number of common DE genes by knock out strategies CE and KO: 343"
## [1] "number of common DE genes by knock out strategies KO and PTC: 324"
## [1] "number of common DE genes by knock out strategies PTC and CE: 380"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18935  4907 
## 
## FALSE  TRUE 
## 18935  4907 
## [1] "PrS"
## 
## TRUE 
##   78 
##      
##       EPAS1 FOSB GCM1 GRHL1 POU2F3 PPARG WT
##   CE      6    3    3     3      3     3  0
##   KO      6    3    3     3      3     3  0
##   PTC     6    3    3     3      3     3  0
##   WT      0    0    0     0      0     0 15
## 
##   EPAS1_CE   EPAS1_KO  EPAS1_PTC    FOSB_CE    FOSB_KO   FOSB_PTC    GCM1_CE 
##          6          6          6          3          3          3          3 
##    GCM1_KO   GCM1_PTC   GRHL1_CE   GRHL1_KO  GRHL1_PTC  POU2F3_CE  POU2F3_KO 
##          3          3          3          3          3          3          3 
## POU2F3_PTC   PPARG_CE   PPARG_KO  PPARG_PTC      WT_WT 
##          3          3          3          3         15 
## [1] "PrS_day_6_hyp"
## [1] "-----------------------------------"
## [1] "Model system: PrS"
## [1] "DE analysis group PrS_day_6_hyp DESeq2 results:"
## [1] "-----------------------------------"
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23911"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 23911 to 23326"
## Time difference of 9.442636 secs

## [1] "prop of non-null for rd: 8.49e-02"

## [1] "DE group PrS_day_6_hyp under KO gene EPAS1:"
## [1] "number of DE genes from knock out strategy CE: 4196"
## [1] "number of DE genes from knock out strategy KO: 2140"
## [1] "number of DE genes from knock out strategy PTC: 2553"
## [1] "number of common DE genes by knock out strategies CE and KO: 1883"
## [1] "number of common DE genes by knock out strategies KO and PTC: 1628"
## [1] "number of common DE genes by knock out strategies PTC and CE: 1960"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18654  4672 
## 
## FALSE  TRUE 
## 18654  4672 
## [1] "PrS_day_6_nor"
## [1] "-----------------------------------"
## [1] "Model system: PrS"
## [1] "DE analysis group PrS_day_6_nor DESeq2 results:"
## [1] "-----------------------------------"
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23493"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes does not change."
## Time difference of 1.405563 mins

## [1] "prop of non-null for rd: 1.30e-01"

## [1] "DE group PrS_day_6_nor under KO gene EPAS1:"
## [1] "number of DE genes from knock out strategy CE: 1372"
## [1] "number of DE genes from knock out strategy KO: 40"
## [1] "number of DE genes from knock out strategy PTC: 304"
## [1] "number of common DE genes by knock out strategies CE and KO: 37"
## [1] "number of common DE genes by knock out strategies KO and PTC: 29"
## [1] "number of common DE genes by knock out strategies PTC and CE: 236"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18811  4682 
## 
## FALSE  TRUE 
## 18811  4682

## [1] "DE group PrS_day_6_nor under KO gene FOSB:"
## [1] "number of DE genes from knock out strategy CE: 409"
## [1] "number of DE genes from knock out strategy KO: 847"
## [1] "number of DE genes from knock out strategy PTC: 121"
## [1] "number of common DE genes by knock out strategies CE and KO: 324"
## [1] "number of common DE genes by knock out strategies KO and PTC: 103"
## [1] "number of common DE genes by knock out strategies PTC and CE: 43"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18811  4682 
## 
## FALSE  TRUE 
## 18811  4682

## [1] "DE group PrS_day_6_nor under KO gene GCM1:"
## [1] "number of DE genes from knock out strategy CE: 5299"
## [1] "number of DE genes from knock out strategy KO: 4486"
## [1] "number of DE genes from knock out strategy PTC: 4255"
## [1] "number of common DE genes by knock out strategies CE and KO: 4013"
## [1] "number of common DE genes by knock out strategies KO and PTC: 3770"
## [1] "number of common DE genes by knock out strategies PTC and CE: 3823"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18811  4682 
## 
## FALSE  TRUE 
## 18811  4682

## [1] "DE group PrS_day_6_nor under KO gene GRHL1:"
## [1] "number of DE genes from knock out strategy CE: 3439"
## [1] "number of DE genes from knock out strategy KO: 2483"
## [1] "number of DE genes from knock out strategy PTC: 5095"
## [1] "number of common DE genes by knock out strategies CE and KO: 1980"
## [1] "number of common DE genes by knock out strategies KO and PTC: 2044"
## [1] "number of common DE genes by knock out strategies PTC and CE: 3057"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18811  4682 
## 
## FALSE  TRUE 
## 18811  4682

## [1] "DE group PrS_day_6_nor under KO gene POU2F3:"
## [1] "number of DE genes from knock out strategy CE: 2038"
## [1] "number of DE genes from knock out strategy KO: 590"
## [1] "number of DE genes from knock out strategy PTC: 6"
## [1] "number of common DE genes by knock out strategies CE and KO: 462"
## [1] "number of common DE genes by knock out strategies KO and PTC: 4"
## [1] "number of common DE genes by knock out strategies PTC and CE: 3"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18811  4682 
## 
## FALSE  TRUE 
## 18811  4682

## [1] "DE group PrS_day_6_nor under KO gene PPARG:"
## [1] "number of DE genes from knock out strategy CE: 4503"
## [1] "number of DE genes from knock out strategy KO: 3029"
## [1] "number of DE genes from knock out strategy PTC: 4014"
## [1] "number of common DE genes by knock out strategies CE and KO: 2690"
## [1] "number of common DE genes by knock out strategies KO and PTC: 2334"
## [1] "number of common DE genes by knock out strategies PTC and CE: 3173"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18811  4682 
## 
## FALSE  TRUE 
## 18811  4682

Save the size dataframe out

colnames(df_size) = c("DE_group", "knockout_gene_strategy", "n_ko", "n_WT")
df_size = as.data.frame(df_size)

multi_rows = which(table(df_size$knockout_gene_strategy)>1)
multi_rows
##  EPAS1_CE  EPAS1_KO EPAS1_PTC 
##         1         2         3
df_size[which(df_size$knockout_gene_strategy%in%names(multi_rows)),]
##        DE_group knockout_gene_strategy n_ko n_WT
## 4 PrS_day_6_hyp               EPAS1_CE    3    3
## 5 PrS_day_6_hyp               EPAS1_KO    3    3
## 6 PrS_day_6_hyp              EPAS1_PTC    3    3
## 7 PrS_day_6_nor               EPAS1_CE    3   12
## 8 PrS_day_6_nor               EPAS1_KO    3   12
## 9 PrS_day_6_nor              EPAS1_PTC    3   12
fwrite(df_size, 
       sprintf("%s/%s_DE_n_samples.tsv", output_dir, release_name), 
       sep="\t")

DE analysis with respect to condition (hypoxia vs. normoxia)

This is only relevant for model system PrS.

Within model system PrS, for WT samples, only use the WT samples for EPAS1 gene for this analysis.

There are 24 samples involved. 6 WT and 6 EPAS1 knockout samples for each knockout strategy. Half of them are under normoxia and the other half are under hypoxia. All the 24 samples are from run_id “20230918_23-robson-008”.

For the 6 WT samples, we do not know whether they are from 3 single cell clones, with each clone having two groups of cells developed under different conditions. For now, run regular DESeq2 on them.

For the 6 knockout samples under each knockout strategy, they are from three single cell clones, with each clone having two groups of cells developed under different conditions. Run paired DESeq2 on them.

df_size = NULL

meta$ko_group = paste(meta$ko_gene, meta$ko_strategy, sep="_")
relevant_ko_groups = unique(meta$ko_group[which(meta$condition=="hyp")])
relevant_ko_groups = sort(relevant_ko_groups)

relevant_samples = meta$label[which(meta$run_id=="20230918_23-robson-008")]
relevant_samples
##  [1] "PrS-MOK20012C-G07-nor" "PrS-MOK20012-WT1-hyp"  "PrS-MOK20012P-A09-nor"
##  [4] "PrS-MOK20012W-B02-hyp" "PrS-MOK20012-WT1-nor"  "PrS-MOK20012P-D10-nor"
##  [7] "PrS-MOK20012C-E06-nor" "PrS-MOK20012P-G09-hyp" "PrS-MOK20012-WT2-nor" 
## [10] "PrS-MOK20012C-E06-hyp" "PrS-MOK20012-WT3-hyp"  "PrS-MOK20012C-H06-hyp"
## [13] "PrS-MOK20012-WT2-hyp"  "PrS-MOK20012W-B01-nor" "PrS-MOK20012C-G07-hyp"
## [16] "PrS-MOK20012W-A03-nor" "PrS-MOK20012W-B02-nor" "PrS-MOK20012W-B01-hyp"
## [19] "PrS-MOK20012-WT3-nor"  "PrS-MOK20012C-H06-nor" "PrS-MOK20012W-A03-hyp"
## [22] "PrS-MOK20012P-G09-nor" "PrS-MOK20012P-A09-hyp" "PrS-MOK20012P-D10-hyp"
for (cur_ko_group in relevant_ko_groups){
  
  # note that gene_group is for indicating which knockout gene that WT samples are for
  # for knockout samples, its value is the knockout gene
  sample2kp = which((meta$label%in%relevant_samples)&
                    (meta$ko_group == cur_ko_group))
  cts_dat_cond = cts_dat[,sample2kp]
  meta_cond    = meta[sample2kp,]
  
  dim(cts_dat_cond)
  dim(meta_cond)
  
  unique_model_day = unique(paste0(meta_cond$model_system, "_day_", 
                                   meta_cond$timepoint))
  print("")
  print("==========================================")
  print(paste0(unique_model_day, ", ", cur_ko_group, " samples"))
  print("hyp vs nor DE results:")
  print("==========================================")
  print("")
  
  stopifnot(all(meta_cond$file_id==colnames(cts_dat_cond)))
  table(meta_cond$file_id==colnames(cts_dat_cond), useNA="ifany")
  stopifnot(all(meta_cond$ko_group==cur_ko_group))
  
  print(table(meta_cond$condition, meta_cond$run_id_short))
  
  cols2kp = c("model_system", "timepoint", "clonal.label", "condition")
  sample_info_cond = meta_cond[,cols2kp]
  colnames(sample_info_cond)[which(colnames(sample_info_cond)=="clonal.label")] = "clonal_label"
  
  dim(sample_info_cond)
  sample_info_cond[1:2,]
  
  # do two steps of filtering
  # first, filter based on q75 of each gene
  # second, filter based on whether each gene is expressed in at least 4 cells
  
  cond_q75 = apply(cts_dat_cond, 1, quantile, probs=0.75)
  cts_dat_cond = cts_dat_cond[cond_q75 > 0,]
  
  print(sprintf("After filtering by gene expression q75, the number of genes reduces from %d to %d", 
                length(cond_q75), nrow(cts_dat_cond)))
  
  n_pos = rowSums(cts_dat_cond>0)
  cts_dat_cond = cts_dat_cond[n_pos >= 4,]
  
  if (length(n_pos)==nrow(cts_dat_cond)){
    print("After filtering by nonzero expression in at least 4 samples, the number of genes does not change.")
  }else{
    print(sprintf("After filtering by nonzero expression in at least 4 samples, the number of genes reduces from %d to %d", 
                  length(n_pos), nrow(cts_dat_cond)))    
  }
  
  # update the q75 based on only genes that will be used in the DE analysis
  sample_info_cond$q75 = apply(cts_dat_cond, 2, quantile, probs = 0.75)
  sample_info_cond$log_q75 = log(sample_info_cond$q75)
  
  if (cur_ko_group=="WT_WT"){
    dds = DESeqDataSetFromMatrix(cts_dat_cond, colData=sample_info_cond, 
                                  ~ condition + log_q75)
  }else{
    dds = DESeqDataSetFromMatrix(cts_dat_cond, colData=sample_info_cond, 
                                  ~ clonal_label + log_q75 + condition)     
  }
  
  start.time <- Sys.time()
  dds = DESeq(dds)
  end.time <- Sys.time()
  
  time.taken <- end.time - start.time
  print(time.taken)
  
  ## association with read-depth
  res_rd = results(dds, name = "log_q75")
  res_rd = as.data.frame(res_rd)
  dim(res_rd)
  res_rd[1:2,]
  
  table(is.na(res_rd$padj))
  
  g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) + 
    geom_histogram(color="darkblue", fill="lightblue", 
                   breaks = seq(0,1,by=0.01)) + 
    ggtitle(paste0(unique_model_day, " ", cur_ko_group, " hyp vs nor\nread depth"))
  print(g0)
  
  ## Rerun the analysis without read-depth if it is not significant for 
  ## most genes, and the number of samples is small
  ## i.e., proportion of non-null in the distribution is smaller than 0.01
  ## (this needs to be restricted to the genes with not NA adjusted pvalue)
  ## or if smaller than 0.1 and the number of samples is not greater than 6
  
  pi_1_rd = max(0, mean(res_rd$pvalue[!is.na(res_rd$padj)] < 0.05) - 0.05)
  pi_1_rd = max(pi_1_rd, 0, 1 - 2*mean(res_rd$pvalue[!is.na(res_rd$padj)] > 0.5))
  
  print(sprintf("prop of non-null for rd: %.2e", pi_1_rd))
  
  if(pi_1_rd < 0.01 || (ncol(cts_dat_cond) <= 6 && pi_1_rd < 0.1 )){
    print("rerun DESeq2 without read depth")
    if (cur_ko_group=="WT_WT"){
      dds = DESeqDataSetFromMatrix(cts_dat_cond, colData=sample_info_cond, 
                                    ~ condition)
    }else{
      dds = DESeqDataSetFromMatrix(cts_dat_cond, colData=sample_info_cond, 
                                    ~ clonal_label + condition)  
    }
    dds = DESeq(dds)
  }
  
  ## DE analysis between two conditions
  res_g1 = results(dds, contrast = c("condition", "hyp", "nor"))
  res_g1 = signif(data.frame(res_g1), 3)
  
  # add a column to record the number of zeros per test
  n_zero = rowSums(cts_dat_cond==0)
  res_g1$n_zeros = n_zero    
  
  # record the number of samples involved in the comparison
  df_size = rbind(df_size, 
                  c(unique_model_day, 
                    cur_ko_group, 
                    sum(sample_info_cond$condition=="hyp"), 
                    sum(sample_info_cond$condition=="nor")))   
    
  # prepare a processed version of output
  res_g1_reduced = res_g1[, c("log2FoldChange", "pvalue", "padj", "n_zeros")]
  res_g1_reduced$padj[which(res_g1_reduced$n_zeros>=4)] = NA
  
  res_g1_reduced = res_g1_reduced[, c("log2FoldChange", "pvalue", "padj")]      
  
  n_DE = sum(res_g1_reduced$padj < 0.05 & abs(res_g1_reduced$log2FoldChange) > log2(1.5), 
             na.rm = TRUE)
  print(paste0(unique_model_day, " between nor and hyp conditions, ", cur_ko_group, ":"))
  print(sprintf("number of DE genes: %d", n_DE))
  
  g1 = ggplot(res_g1_reduced %>% subset(!is.na(padj)), aes(x=pvalue)) + 
    geom_histogram(color="darkblue", fill="lightblue", 
                   breaks = seq(0,1,by=0.01)) + 
    ggtitle(paste0(unique_model_day, " ", cur_ko_group, "\nhyp vs nor"))
  print(g1)
  
  tag1 = "hyp_vs_nor_"
  colnames(res_g1) = paste0(tag1, colnames(res_g1))
  colnames(res_g1_reduced) = paste0(tag1, colnames(res_g1_reduced))
  
  
  res_df = data.frame(gene_ID=rownames(res_g1), 
                      res_g1)
  dim(res_df)
  res_df[1:2,]
    
    
  res_reduced_gene_anno = gene_anno[match(rownames(res_g1_reduced), 
                                          gene_anno$ensembl_gene_id), ]
  stopifnot(all(rownames(res_g1_reduced)==res_reduced_gene_anno$ensembl_gene_id))     
    
  # set gene hgnc_symbol that equal "" to NA
  hgnc_symbol_vec = res_reduced_gene_anno$hgnc_symbol
  hgnc_symbol_vec[which(hgnc_symbol_vec=="")] = NA
    
  res_reduced_df = data.frame(gene_ID=rownames(res_g1_reduced), 
                              hgnc_symbol=hgnc_symbol_vec,
                              chr=res_reduced_gene_anno$chr, 
                              strand=res_reduced_gene_anno$strand, 
                              start=res_reduced_gene_anno$start, 
                              end=res_reduced_gene_anno$end, 
                              res_g1_reduced)
  
  print("hgnc_symbol empty string and NA tables:")
  print(table(res_reduced_df$hgnc_symbol=="", useNA="ifany"))
  print(table(is.na(res_reduced_df$hgnc_symbol)))
      
  dim(res_reduced_df)
  res_reduced_df[1:2,]      
    
  fwrite(res_df, 
         sprintf("%s/%s_%s_%s_DEseq2_raw.tsv", 
                 raw_output_dir, release_name, 
                 paste0(unique_model_day, "_", cur_ko_group), "hyp_vs_nor"), 
         sep="\t")
  fwrite(res_reduced_df, 
         sprintf("%s/%s_%s_%s_DEseq2.tsv", 
                 processed_output_dir, release_name, 
                 paste0(unique_model_day, "_", cur_ko_group), "hyp_vs_nor"), 
         sep="\t")  

}
## [1] ""
## [1] "=========================================="
## [1] "PrS_day_6, EPAS1_CE samples"
## [1] "hyp vs nor DE results:"
## [1] "=========================================="
## [1] ""
##      
##       robson-008
##   hyp          3
##   nor          3
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23378"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 23378 to 20937"
## Time difference of 29.57336 secs

## [1] "prop of non-null for rd: 4.29e-01"
## [1] "PrS_day_6 between nor and hyp conditions, EPAS1_CE:"
## [1] "number of DE genes: 3518"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 17474  3463 
## 
## FALSE  TRUE 
## 17474  3463 
## [1] ""
## [1] "=========================================="
## [1] "PrS_day_6, EPAS1_KO samples"
## [1] "hyp vs nor DE results:"
## [1] "=========================================="
## [1] ""
##      
##       robson-008
##   hyp          3
##   nor          3
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23542"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 23542 to 21177"
## Time difference of 1.055004 mins

## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"
## [1] "PrS_day_6 between nor and hyp conditions, EPAS1_KO:"
## [1] "number of DE genes: 2518"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 17534  3643 
## 
## FALSE  TRUE 
## 17534  3643 
## [1] ""
## [1] "=========================================="
## [1] "PrS_day_6, EPAS1_PTC samples"
## [1] "hyp vs nor DE results:"
## [1] "=========================================="
## [1] ""
##      
##       robson-008
##   hyp          3
##   nor          3
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23382"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 23382 to 21031"
## Time difference of 54.79509 secs

## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"
## [1] "PrS_day_6 between nor and hyp conditions, EPAS1_PTC:"
## [1] "number of DE genes: 3664"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 17464  3567 
## 
## FALSE  TRUE 
## 17464  3567 
## [1] ""
## [1] "=========================================="
## [1] "PrS_day_6, WT_WT samples"
## [1] "hyp vs nor DE results:"
## [1] "=========================================="
## [1] ""
##      
##       robson-008
##   hyp          3
##   nor          3
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23704"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 23704 to 21221"
## Time difference of 9.070153 secs

## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"
## [1] "PrS_day_6 between nor and hyp conditions, WT_WT:"
## [1] "number of DE genes: 6724"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 17531  3690 
## 
## FALSE  TRUE 
## 17531  3690
colnames(df_size) = c("model_system_day", "knockout_gene_strategy", "n_hyp", "n_nor")
fwrite(df_size, 
       sprintf("%s/%s_DE_hyp_vs_nor_n_samples.tsv", output_dir, release_name), 
       sep="\t")
gc()
##            used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells  8143434 435.0   12622100 674.1         NA 12622100 674.1
## Vcells 34191096 260.9   96789315 738.5      65536 96789315 738.5
sessionInfo()
## R version 4.2.3 (2023-03-15)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] readxl_1.4.3                UpSetR_1.4.0               
##  [3] ComplexHeatmap_2.14.0       data.table_1.15.4          
##  [5] dplyr_1.1.4                 ggpointdensity_0.1.0       
##  [7] viridis_0.6.5               viridisLite_0.4.2          
##  [9] DESeq2_1.38.3               SummarizedExperiment_1.28.0
## [11] Biobase_2.58.0              MatrixGenerics_1.10.0      
## [13] matrixStats_1.3.0           GenomicRanges_1.50.2       
## [15] GenomeInfoDb_1.34.9         IRanges_2.32.0             
## [17] S4Vectors_0.36.2            BiocGenerics_0.44.0        
## [19] RColorBrewer_1.1-3          MASS_7.3-60                
## [21] stringr_1.5.1               ggpubr_0.6.0               
## [23] ggrepel_0.9.5               ggplot2_3.5.1              
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-8           bit64_4.0.5            doParallel_1.0.17     
##  [4] httr_1.4.7             tools_4.2.3            backports_1.5.0       
##  [7] bslib_0.8.0            utf8_1.2.4             R6_2.5.1              
## [10] DBI_1.2.3              colorspace_2.1-1       GetoptLong_1.0.5      
## [13] withr_3.0.1            tidyselect_1.2.1       gridExtra_2.3         
## [16] bit_4.0.5              compiler_4.2.3         cli_3.6.3             
## [19] Cairo_1.6-2            DelayedArray_0.24.0    labeling_0.4.3        
## [22] sass_0.4.9             scales_1.3.0           digest_0.6.37         
## [25] rmarkdown_2.28         XVector_0.38.0         pkgconfig_2.0.3       
## [28] htmltools_0.5.8.1      highr_0.11             fastmap_1.2.0         
## [31] GlobalOptions_0.1.2    rlang_1.1.4            rstudioapi_0.16.0     
## [34] RSQLite_2.3.7          farver_2.1.2           shape_1.4.6.1         
## [37] jquerylib_0.1.4        generics_0.1.3         jsonlite_1.8.8        
## [40] BiocParallel_1.32.6    car_3.1-2              RCurl_1.98-1.16       
## [43] magrittr_2.0.3         GenomeInfoDbData_1.2.9 Matrix_1.5-4.1        
## [46] Rcpp_1.0.13            munsell_0.5.1          fansi_1.0.6           
## [49] abind_1.4-5            lifecycle_1.0.4        stringi_1.8.4         
## [52] yaml_2.3.10            carData_3.0-5          zlibbioc_1.44.0       
## [55] plyr_1.8.9             blob_1.2.4             parallel_4.2.3        
## [58] crayon_1.5.3           lattice_0.22-6         Biostrings_2.66.0     
## [61] annotate_1.76.0        circlize_0.4.16        KEGGREST_1.38.0       
## [64] locfit_1.5-9.10        knitr_1.48             pillar_1.9.0          
## [67] rjson_0.2.21           ggsignif_0.6.4         geneplotter_1.76.0    
## [70] codetools_0.2-20       XML_3.99-0.17          glue_1.7.0            
## [73] evaluate_0.24.0        foreach_1.5.2          vctrs_0.6.5           
## [76] png_0.1-8              cellranger_1.1.0       gtable_0.3.5          
## [79] purrr_1.0.2            tidyr_1.3.1            clue_0.3-65           
## [82] cachem_1.1.0           xfun_0.47              xtable_1.8-4          
## [85] broom_1.0.6            rstatix_0.7.2          tibble_3.2.1          
## [88] iterators_1.0.14       AnnotationDbi_1.60.2   memoise_2.0.1         
## [91] cluster_2.1.6